# Locate and load the prepared email feature data.
# NOTE(review): hard-coding an absolute personal path is fragile; prefer
# project-relative paths (e.g. the here package) if this script is reused.
getwd()
## [1] "/Users/pankaj/dev/git/smu/qtw/case_study3"
setwd("/Users/pankaj/dev/git/smu/qtw/case_study3/Week_5_Materials_2")
# Loads the data frame `emailDFrp` into the session.
load("data.Rda")
# For a data frame, length() is the number of columns (30 here).
length(emailDFrp)
## [1] 30
# Quick visual scan of per-column missing-value counts.
plot(colSums(is.na(emailDFrp)))
# Work on a copy and add a row id. seq_len(nrow(...)) replaces
# seq(1, length(df1$isRe)): same values, but safe for zero rows and
# independent of any particular column.
df1 <- emailDFrp
df1$id <- seq_len(nrow(df1))
# Exploratory bar charts: distribution of each categorical/logical feature,
# filled by spam status. The original repeated the identical ggplot pipeline
# eighteen times; a small helper removes the duplication while producing the
# same plots in the same order.

# Bar chart of one feature (given by name) coloured by isSpam.
# The .data pronoun resolves the string column name inside aes().
plot_feature_bar <- function(data, var) {
  data %>%
    ggplot(mapping = aes(x = .data[[var]], fill = isSpam)) +
    geom_bar()
}

categorical_vars <- c(
  "isRe", "underscore", "priority", "isInReplyTo", "sortedRec", "subPunc",
  "multipartText", "hasImages", "isPGPsigned", "subSpamWords", "noHost",
  "numEnd", "isYelling", "isOrigMsg", "isDear", "isWrote",
  "subExcCt", "subQuesCt"
)

# print() is required for ggplot objects created inside a loop.
for (v in categorical_vars) {
  print(plot_feature_bar(df1, v))
}
## Warning: Removed 20 rows containing non-finite values (stat_count).
## (emitted for subExcCt and subQuesCt, which contain 20 NA values)
# numLines, numDlr and numRec are continuous counts. geom_bar() no longer has
# a `binwidth` parameter (see the captured warnings in the original output),
# so geom_histogram(binwidth = 100) is the intended call and silences the
# deprecation warning.
df1 %>%
ggplot(mapping = aes(x= numLines, fill = isSpam))+
geom_histogram(binwidth = 100)
df1 %>%
ggplot(mapping = aes(x= numDlr, fill = isSpam))+
geom_histogram(binwidth = 100)
# numAtt was plotted as a plain bar chart (no binwidth); kept as-is.
df1 %>%
ggplot(mapping = aes(x= numAtt, fill = isSpam))+
geom_bar()
df1 %>%
ggplot(mapping = aes(x= numRec, fill = isSpam))+
geom_histogram(binwidth = 100)
## Warning: Removed 282 rows containing non-finite values (stat_bin).
## (numRec has 282 missing values)
# Emails by hour of day, split by spam status (hour is discrete, so
# geom_bar() is fine here).
df1 %>%
ggplot(mapping = aes(x= hour, fill = isSpam))+
geom_bar()
# perHTML is continuous (presumably a percentage -- TODO confirm), so
# geom_bar() produced "position_stack requires non-overlapping x intervals".
# Bin it explicitly with a histogram instead.
df1 %>%
ggplot(mapping = aes(x= perHTML, fill = isSpam))+
geom_histogram(binwidth = 5)
# Frequency polygons for the continuous features, coloured by spam status.
# Binwidths are kept exactly as in the per-plot originals.
freqpoly_binwidths <- c(
  forwards   = 1,
  perCaps    = 1,
  bodyCharCt = 100,
  avgWordLen = 1,
  subBlanks  = 1
)
for (feature in names(freqpoly_binwidths)) {
  p <- df1 %>%
    ggplot(mapping = aes(x = .data[[feature]], color = isSpam)) +
    geom_freqpoly(binwidth = freqpoly_binwidths[[feature]])
  print(p)
}
## Warning: Removed 20 rows containing non-finite values (stat_bin).
## (emitted for subBlanks, which has 20 missing values)
We can see from the above graphs that some variables dominate one type of message more than the other. For example, the variables isInReplyTo and priority take essentially a single value in spam messages, which means they might be useful for splitting in a tree-based model. Other attributes, such as subPunc and multipartText, are almost equally present in both types of messages. Similarly, when isPGPsigned is true the message is almost always non-spam.
## Confusion Matrix and Statistics
##
## pred.rpint1
## F T
## F 3489 3
## T 1122 60
##
## Accuracy : 0.7593
## 95% CI : (0.7468, 0.7715)
## No Information Rate : 0.9865
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0727
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75667
## Specificity : 0.95238
## Pos Pred Value : 0.99914
## Neg Pred Value : 0.05076
## Prevalence : 0.98652
## Detection Rate : 0.74647
## Detection Prevalence : 0.74711
## Balanced Accuracy : 0.85452
##
## 'Positive' Class : F
##
We can see that the accuracy of the model above is not very high, so we run a grid search over all variables and different tree depths to figure out which tree is best suited and where we will get optimal results without overfitting the model.
# Repeated 10-fold cross-validation (3 repeats); class probabilities are
# required so twoClassSummary can compute ROC/Sens/Spec.
train.control <- trainControl(
method = "repeatedcv",
number = 10, ## 10-fold CV
repeats = 3,## repeated three times
# USE AUC
summaryFunction = twoClassSummary,
classProbs = TRUE
)
# Tuning grid: candidate maximum tree depths 2..15 for caret's "rpart2"
# method (CART tuned on maxdepth).
tune.gridcart <- expand.grid(maxdepth = 2:15)
#tune.gridcart <- expand.grid(cp = seq(2,10))
# Grid search over all predictors, selecting the depth with the largest ROC.
# na.action = na.pass forwards rows containing NAs to the model instead of
# dropping them. system.time() just reports how long the search took.
system.time (rpartFit2 <- train(isSpam ~ . ,
data = emailDFrp,
method = "rpart2",
tuneGrid =tune.gridcart,
trControl = train.control,
metric = "ROC", na.action = na.pass
))
## user system elapsed
## 30.890 1.213 12.861
# Print the cross-validated results: ROC, sensitivity and specificity for
# each candidate maxdepth.
rpartFit2
## CART
##
## 9348 samples
## 29 predictor
## 2 classes: 'F', 'T'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 8413, 8414, 8414, 8413, 8414, 8412, ...
## Resampling results across tuning parameters:
##
## maxdepth ROC Sens Spec
## 2 0.7585786 0.9246642 0.5878144
## 3 0.7761748 0.9501753 0.5849152
## 4 0.7789992 0.9683019 0.5762930
## 5 0.8001320 0.9705070 0.5967329
## 6 0.9161483 0.9513262 0.7352371
## 7 0.9161483 0.9513262 0.7352371
## 8 0.9164148 0.9510385 0.7366318
## 9 0.9177516 0.9530529 0.7377586
## 10 0.9187233 0.9554506 0.7377586
## 11 0.9255864 0.9471549 0.7707078
## 12 0.9289536 0.9450444 0.7860094
## 13 0.9348707 0.9428388 0.8093579
## 14 0.9363542 0.9450928 0.8128382
## 15 0.9371638 0.9479696 0.8149262
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 15.
# Plot ROC vs maxdepth: a large jump at depth 6, only small gains after.
plot(rpartFit2)
From the above grid search it looks like the area under the ROC curve increases with tree depth, but the increase is small beyond a maximum depth of 6, so that is the depth we will choose for our final model.
# Fit the final classification tree at the depth chosen by the grid search.
#
# BUG FIX: rpart() builds a control list from parameters passed via `...`
# and then OVERWRITES it with an explicitly supplied `control` argument.
# The original call passed maxsurrogate/cp/minsplit/minbucket through `...`
# while ALSO supplying control = rpart.control(maxdepth = 6), so everything
# except maxdepth was silently reset to the rpart defaults. Putting all the
# settings in a single rpart.control() call makes them actually take effect.
emailDFrp.rp <- rpart(isSpam ~ . ,             # formula
            data = emailDFrp,                  # data frame
            subset = train,                    # indices of training set
            method = "class",                  # classification tree
            parms = list(split = "information"), # use entropy/deviance
            control = rpart.control(
              maxsurrogate = 0, # no surrogate splits (no missing values)
              cp = 0,           # no size penalty
              minsplit = 5,     # nodes of size 5 or more can be split,
              minbucket = 2,    # provided each sub-node keeps >= 2 obs
              maxdepth = 6      # depth selected by the grid search above
            )
)
#summary(emailDFrp.rp)
# Base-graphics rendering of the fitted tree.
plot(emailDFrp.rp,
     uniform=TRUE,
     compress=TRUE,
     margin = .2)
text(emailDFrp.rp,
     use.n=TRUE,
     all = TRUE,
     fancy = TRUE)
# Prettier rendering of the same tree.
fancyRpartPlot(emailDFrp.rp, main="Decision Tree Graph")
As per the above plot, we can see the first split is on the variable forwards. If the value is more than 5.5, another split is made on the variable perCaps. We can see that 53% of all mails are in the node with forwards > 6 and perCaps less than 6; it contains 73% non-spam and 27% spam. We split this node further based on subBlanks less than 24. The other nodes can be explained in the same way, until we finally reach the leaf nodes.
Confusion matrix for the model based on all variables and a maximum tree height of 6.
# Predict class labels for the held-out test rows (those not in `train`).
pred.rpint <- predict(emailDFrp.rp,
newdata = emailDFrp[-train,],
type = "class")
#
# Classification table on the test data
#
# Rows: true isSpam labels; columns: predicted classes.
cm = table(emailDFrp$isSpam[-train], pred.rpint)
print(confusionMatrix(cm))
## Confusion Matrix and Statistics
##
## pred.rpint
## F T
## F 3387 105
## T 490 692
##
## Accuracy : 0.8727
## 95% CI : (0.8628, 0.8821)
## No Information Rate : 0.8295
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6224
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8736
## Specificity : 0.8683
## Pos Pred Value : 0.9699
## Neg Pred Value : 0.5854
## Prevalence : 0.8295
## Detection Rate : 0.7246
## Detection Prevalence : 0.7471
## Balanced Accuracy : 0.8709
##
## 'Positive' Class : F
##
As we can see, the accuracy of this model is 87% on the test data.
The most important variables determined by the tree model are given by the following attribute of the model.
# Per-variable importance scores computed by rpart for the fitted tree,
# sorted from most to least important.
emailDFrp.rp$variable.importance
## forwards isInReplyTo isRe perCaps isWrote bodyCharCt
## 535.602402 365.401656 323.694079 309.072712 264.207776 214.666202
## numLines subBlanks subExcCt numRec isOrigMsg avgWordLen
## 170.442266 121.321644 74.526799 38.481899 32.934959 22.330894
## numDlr perHTML priority isPGPsigned subQuesCt
## 9.908102 5.585596 4.718144 4.303511 4.140931
We can see that forwards, isInReplyTo, isRe, etc. are among the most important variables. We reached a similar conclusion during our exploratory data analysis: one type of email was dominated by certain variables more than the other.
# Helpers to turn an H2O tree into a data.tree structure for plotting.
library(data.tree)

# Convert an H2O tree object into a data.tree Node rooted at the tree's
# first split feature; all descendants are attached recursively.
createDataTree <- function(h2oTree) {
  root <- h2oTree@root_node
  converted <- Node$new(root@split_feature)
  converted$type <- 'split'
  addChildren(converted, root)
  converted
}
# Recursively attach the children of `node` (an H2O split node) to the
# data.tree node `dtree`. Returns TRUE when `node` is a leaf (recursion
# stops); callers ignore the return value.
addChildren <- function(dtree, node) {
  # Nothing to add below a leaf.
  if (class(node)[1] != 'H2OSplitNode') return(TRUE)

  na_direction <- node@na_direction

  # Edge labels: categorical splits list the level values, numeric splits
  # show the threshold; ",NA" marks the branch that receives missing values.
  if (is.na(node@threshold)) {
    leftEdgeLabel <- printValues(node@left_levels,
                                 na_direction == 'LEFT', 4)
    rightEdgeLabel <- printValues(node@right_levels,
                                  na_direction == 'RIGHT', 4)
  } else {
    leftEdgeLabel <- paste("<", node@threshold,
                           if (na_direction == 'LEFT') ',NA' else '')
    rightEdgeLabel <- paste(">=", node@threshold,
                            if (na_direction == 'RIGHT') ',NA' else '')
  }

  # Node label: the prediction for leaves, the split feature otherwise.
  label_for <- function(child) {
    if (class(child)[[1]] == 'H2OLeafNode') {
      paste("prediction:", child@prediction)
    } else {
      child@split_feature
    }
  }
  left_node <- node@left_child
  right_node <- node@right_child
  leftLabel <- label_for(left_node)
  rightLabel <- label_for(right_node)

  # data.tree sibling names must be distinct; disambiguate duplicates.
  if (leftLabel == rightLabel) {
    leftLabel <- paste(leftLabel, "(L)")
    rightLabel <- paste(rightLabel, "(R)")
  }

  # Create a styled child branch under dtree and recurse into it.
  attach_branch <- function(label, edge, child) {
    branch <- dtree$AddChild(label)
    branch$edgeLabel <- edge
    branch$type <- if (class(child)[1] == 'H2OSplitNode') 'split' else 'leaf'
    addChildren(branch, child)
  }
  attach_branch(leftLabel, leftEdgeLabel, left_node)
  attach_branch(rightLabel, rightEdgeLabel, right_node)

  return(FALSE)
}
# Render up to `n` categorical level values as a comma-separated edge label,
# appending ",..." when the list is truncated and ", NA" when missing values
# are routed down this branch.
printValues <- function(values, is_na_direction, n = 4) {
  count <- length(values)
  if (count == 0) {
    if (is_na_direction) "NA" else ""
  } else {
    shown <- paste0(values[seq_len(min(n, count))], collapse = ', ')
    paste0(shown,
           if (count > n) ",..." else "",
           if (is_na_direction) ", NA" else "")
  }
}
We use the random forest from the h2o library to generate another tree.
#knitr::opts_chunk$set(fig.width=12, fig.height=8)
# Build and train the model: start (or attach to) a local H2O cluster.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 8 minutes 58 seconds
## H2O cluster timezone: Asia/Singapore
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 2 months and 13 days
## H2O cluster name: H2O_started_from_R_pankaj_xen810
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.38 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.0 (2019-04-26)
# Copy the feature data frame into the H2O cluster.
emailDHex <- as.h2o(emailDFrp)
## (progress bar output: 0% ... 100%)
# 80/20 train/validation split, seeded for reproducibility.
splits <- h2o.splitFrame(data = emailDHex, ratios = .8, seed = 1234)
trainHex <- splits[[1]]
validHex <- splits[[2]]
# Predictors are columns 2..29 of the data frame. The original had a dead
# assignment (predictors = c("isRe")) that was immediately overwritten; it
# has been removed.
predictors <- names(emailDFrp)[2:29]
response <- "isSpam"
# Small random forest: 5 trees of depth <= 5, at least 20 rows per leaf.
emailD_2tree <- h2o.randomForest(x = predictors,
                                 y = response,
                                 ntrees = 5,
                                 max_depth = 5,
                                 min_rows = 20,
                                 binomial_double_trees = TRUE,
                                 training_frame = trainHex,
                                 validation_frame = validHex)
## (progress bar output: 0% ... 20% ... 100%)
# Extract tree number 2 from the fitted random forest for visualisation.
emailDH2oTree2 = h2o.getModelTree(model = emailD_2tree, tree_number = 2)
library(data.tree)
library(DiagrammeR)
# Convert the H2O tree into a data.tree structure (createDataTree helper).
emailDataTree = createDataTree(emailDH2oTree2)
# Styling callbacks: edge label comes from the stored edgeLabel attribute;
# node shape and font are chosen by node type ('split' vs 'leaf').
GetEdgeLabel <- function(node) {return (node$edgeLabel)}
GetNodeShape <- function(node) {switch(node$type,
split = "diamond", leaf = "oval")}
GetFontName <- function(node) {switch(node$type,
split = 'Palatino-bold',
leaf = 'Palatino')}
SetEdgeStyle(emailDataTree, fontname = 'Palatino-italic',
label = GetEdgeLabel, labelfloat = TRUE,
fontsize = "26", fontcolor='royalblue4')
SetNodeStyle(emailDataTree, fontname = GetFontName, shape = GetNodeShape,
fontsize = "26", fontcolor='royalblue4',
height="1.75", width="2")
# Left-to-right layout for readability.
SetGraphStyle(emailDataTree, rankdir = "LR", dpi=70.)
# Open a fresh device and render the styled tree.
dev.new(width=10, height=10)
plot(emailDataTree, output = "graph")
# Print the tree's text representation.
emailDataTree
## levelName
## 1 isYelling
## 2 ¦--hour
## 3 ¦ ¦--prediction: 0
## 4 ¦ °--bodyCharCt
## 5 ¦ ¦--prediction: 0.14285715
## 6 ¦ °--prediction: 0.03846154
## 7 °--isInReplyTo
## 8 ¦--perHTML
## 9 ¦ ¦--numEnd
## 10 ¦ ¦ ¦--forwards
## 11 ¦ ¦ ¦ ¦--prediction: 0.20247933
## 12 ¦ ¦ ¦ °--prediction: 0.9230769
## 13 ¦ ¦ °--multipartText
## 14 ¦ ¦ ¦--prediction: 0.28787878
## 15 ¦ ¦ °--prediction: 0.8250444
## 16 ¦ °--subBlanks
## 17 ¦ ¦--subBlanks
## 18 ¦ ¦ ¦--prediction: 0.03508772
## 19 ¦ ¦ °--prediction: 0.3508772
## 20 ¦ °--perHTML
## 21 ¦ ¦--prediction: 0
## 22 ¦ °--prediction: 0.12903225
## 23 °--prediction: 1
# Release the H2O cluster's resources when done (prompts for confirmation).
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?
We found almost the same variables, in a slightly different order, as in our initial tree model.